{% raw %}
# Prometheus alerting-rule group "node_alert"; each list entry under `rules` defines one alert.
groups:
- name: node_alert
  rules:
  # Target unreachable: fires once a target's `up` series has been 0
  # for one full minute. Highest severity level (P0) in this group.
  - alert: node_down
    expr: up == 0
    for: 1m
    labels:
      level: P0
      type: node
    annotations:
      # Go-template placeholders are expanded by Prometheus at alert time.
      summary: '{{ $labels.job }} 服务器宕机!'
      description: '服务器 {{ $labels.instance }} 宕机!'

  # CPU saturation: 100 minus the average idle percentage across all CPUs
  # of an instance. Fires after the value has stayed above 90 for two hours.
  #
  # The aggregation keeps `job` in addition to `instance`: `avg ... by (...)`
  # drops every label not listed, and the summary annotation below reads
  # `$labels.job`, which would otherwise always render as an empty string.
  - alert: "CPU使用情况"
    expr: 100 - avg(irate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance, job) * 100 > 90
    for: 2h
    labels:
      level: P1
      type: node
    annotations:
      summary: "{{ $labels.job }} CPU 使用率过高！"
      description: "instance: {{ $labels.instance }} ,cpu usage is too high ! value: {{ $value }}"

  # Memory pressure: (total - available) / total, expressed in percent.
  # Fires after the value has stayed above 80 for two hours.
  - alert: '内存使用'
    expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 80
    for: 2h
    labels:
      level: P1
      type: node
    annotations:
      # `$value` is the computed usage percentage at evaluation time.
      summary: '{{ $labels.job }} 内存使用率过高！'
      description: '{{ $labels.instance }} 内存使用大于80%(目前使用:{{ $value}}%)'

  # Disk usage: 100 minus the free-space percentage of each ext4/xfs
  # filesystem. Fires after the value has stayed above 90 for six hours.
  # NOTE(review): node_filesystem_free_bytes presumably includes blocks
  # reserved for root; node_filesystem_avail_bytes may be closer to what
  # `df` reports for regular users - confirm which one is intended.
  - alert: NodeFilesystemUsage
    expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 90
    for: 6h
    labels:
      level: P1
      type: node
    annotations:
      summary: '{{ $labels.job }} : {{ $labels.instance }} 分区使用率过高'
      description: '{{ $labels.instance }}: {{ $labels.mountpoint }} 分区使用大于90% (当前值: {{ $value }})'

  # File-descriptor usage: open descriptors as a percentage of the allowed
  # maximum. Fires after the value has stayed above 90 for one hour.
  # NOTE(review): process_open_fds / process_max_fds look like per-process
  # metrics of the scraped target rather than host-wide counters - confirm
  # that labelling this alert `type: node` is intended.
  - alert: OpenFilesUsage
    expr: (process_open_fds / process_max_fds) *100 > 90
    for: 1h
    labels:
      level: P1
      type: node
    annotations:
      summary: '{{ $labels.job }} : {{ $labels.instance }} 打开文件数过高！'
      description: '{{ $labels.instance }} 打开文件数超过最大打开文件数的90%！'
{% endraw %}
